Libraries

set.seed(895893)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(broom)
library(papaja)
## Loading required package: tinylabels

Introduction

This document outlines using the suggested procedure to simulate the number of participants necessary to accurately measure items. There are two key issues, based on what we know about power, that these ideas should address:

  1. we should see differences in projected sample sizes based on the variability in the variance for those items (i.e., heterogeneity should increase projected sample size)
  2. we should see projected sample sizes that “level off” when pilot data increases. As with regular power estimates, studies can be “overpowered” to detect an effect, and this same idea should be present. For example, if one has a 500 person pilot study, our simulations should suggest a point at which items are likely measured well, which may have happened well before 500.

Explain the Procedure

  1. Take your original pilot data.
  2. Find a critical score for an accurately measured item using SE.
  3. Sample from your pilot data using sample sizes starting at 20 participants and increase in units of 5 up to a value you feel is the maximum sample size.
  4. Calculate the number of items that meet your critical score at each sample size.
  5. Find the smallest sample size for 80%, 85%, 90%, and 95% power.
  6. Designate your minimum and maximum sample sizes from these values, along with the stopping rule used from step 2.

Method

Data Simulation

Population. The data was simulated using the rnorm function assuming a normal distribution for 30 scale type items. Each population was simulated with 1000 data points. No items were rounded for this simulation.

First, the scale of the data was manipulated by creating three sets of scales. The first scale was mimicked after small rating scales (i.e., 1-7 type style) using a \(\mu\) = 4 with a \(\sigma\) = .25 around the mean to create item mean variability. The second scale included a larger potential distribution of scores with a \(\mu\) = 50 (\(\sigma\) = 10) imitating a 0-100 scale. Last, the final scale included a \(\mu\) = 1000 (\(\sigma\) = 150) simulating a study that may include response latency data in the milliseconds. While there are many potential scales, these three represent a large number of potential variables in the social sciences. As we are suggesting variances as a key factor for estimating sample sizes, the scale of the data is influential on the amount of potential variance. Smaller ranges of data (1-7) cannot necessarily have the same variance as larger ranges (0-100).

Next, item variance heterogeneity was included by manipulating the potential \(\sigma\) for each individual item. For small scales, the average item \(\sigma\) = 2 points with a standard deviation of .2, .4, and .8 for low, medium, and high heterogeneity in the variances between items. For the medium scale of data, \(\sigma\) = 25 with a standard deviation of 4, 8, and 16. Last, for the large scale of data, \(\sigma\) = 400 with a standard deviation of 50, 100, and 200 for heterogeneity.

# Item parameters for three measurement scales: for each scale, 30 item
# means (mu) and three sets of 30 item SDs (sigma) representing low,
# medium, and high heterogeneity in the item variances.

# small potential variability overall, sort of 1-7ish scale
mu1 <- rnorm(30, 4, .25)
sigma1.1 <- rnorm(30, 2, .2)
sigma2.1 <- rnorm(30, 2, .4)
sigma3.1 <- rnorm(30, 2, .8)

# SDs must be positive: rnorm() returns NaN (with a warning) for a negative
# sd. The original code guarded only sigma3.2 and sigma3.3, but sigma3.1
# (mean 2, sd .8) can also produce negative draws, so guard it the same way.
while(sum(sigma3.1 < 0) > 0){
  sigma3.1 <- rnorm(30, 2, .8)
}

# medium potential variability 0 to 100 scale
mu2 <- rnorm(30, 50, 10)
sigma1.2 <- rnorm(30, 25, 4)
sigma2.2 <- rnorm(30, 25, 8)
sigma3.2 <- rnorm(30, 25, 16)

# redraw until all item SDs are positive
while(sum(sigma3.2 < 0) > 0){
  sigma3.2 <- rnorm(30, 25, 16)
}

# large potential variability in the 1000s scale
mu3 <- rnorm(30, 1000, 150)
sigma1.3 <- rnorm(30, 400, 50)
sigma2.3 <- rnorm(30, 400, 100)
sigma3.3 <- rnorm(30, 400, 200)

# redraw until all item SDs are positive
while(sum(sigma3.3 < 0) > 0){
  sigma3.3 <- rnorm(30, 400, 200)
}

# Build one simulated population: 30 items x 3 scales x 1000 observations.
# The item/scale columns are laid out so that rnorm()'s recycling of the
# 30-element mean/sd vectors lines up with the repeating item index
# (item cycles 1:30 within each scale block of 30,000 rows).
build_population <- function(sd_small, sd_medium, sd_large) {
  n_per_scale <- 1000 * 30
  data.frame(
    item = rep(1:30, 1000 * 3),
    scale = rep(1:3, each = n_per_scale),
    score = c(rnorm(n_per_scale, mean = mu1, sd = sd_small),
              rnorm(n_per_scale, mean = mu2, sd = sd_medium),
              rnorm(n_per_scale, mean = mu3, sd = sd_large))
  )
}

# one population per heterogeneity level (low, medium, high item-SD spread)
population1 <- build_population(sigma1.1, sigma1.2, sigma1.3)
population2 <- build_population(sigma2.1, sigma2.2, sigma2.3)
population3 <- build_population(sigma3.1, sigma3.2, sigma3.3)

# evidence that they are simulated correctly: per-item means by scale
# should track mu1 (~4), mu2 (~50), and mu3 (~1000)
tapply(population1$score, list(population1$item, 
                               population1$scale), mean)
##           1        2         3
## 1  3.902902 30.46267  775.0367
## 2  3.641517 46.46319 1067.1570
## 3  3.812100 42.52329 1295.5779
## 4  3.854610 58.00755 1096.0101
## 5  3.433522 42.00242  872.0807
## 6  4.143272 71.21973 1148.3532
## 7  4.107761 54.17529 1050.0229
## 8  4.434764 37.40733  958.1336
## 9  3.619174 40.06408  983.3803
## 10 4.006974 36.93354 1198.8293
## 11 3.267784 49.48155 1225.3493
## 12 3.813818 45.54308 1329.6380
## 13 4.269054 55.74713  968.1331
## 14 3.960559 41.97700 1093.0687
## 15 3.960783 56.92441  958.7610
## 16 3.839746 32.21330 1048.6222
## 17 3.966508 69.81192  835.1373
## 18 4.079169 40.03570 1258.1913
## 19 4.357796 50.84897 1199.0519
## 20 3.961464 44.77769  818.1981
## 21 3.965871 41.93207 1157.8404
## 22 4.347589 51.05360  874.5282
## 23 4.045330 38.66577 1068.7840
## 24 4.239570 52.51189  988.9018
## 25 3.721038 59.13788  832.1767
## 26 3.858812 50.79533 1092.1147
## 27 4.219286 48.45677  935.6302
## 28 3.862671 51.12075  863.7208
## 29 3.704449 62.41377  782.8347
## 30 4.492225 58.47688  806.9001
# per-item SDs by scale should track sigma1.1 (~2), sigma1.2 (~25), sigma1.3 (~400)
tapply(population1$score, list(population1$item,
                               population1$scale), sd) 
##           1        2        3
## 1  1.890627 26.34446 447.9221
## 2  2.058239 18.48790 413.3759
## 3  1.948857 26.27541 417.0936
## 4  1.991223 24.31908 372.9159
## 5  1.969047 22.25882 315.2520
## 6  1.883817 19.87603 314.8351
## 7  1.833318 32.14773 350.0537
## 8  2.122209 21.25980 448.5052
## 9  2.306645 23.92401 402.1434
## 10 1.754506 24.73113 391.3724
## 11 1.587011 24.22367 365.5963
## 12 1.802461 30.18503 421.5994
## 13 1.999581 34.07362 371.7095
## 14 1.789389 27.55662 307.0195
## 15 2.007439 30.14664 431.6300
## 16 2.050786 25.18630 393.5207
## 17 2.241356 23.31799 380.9175
## 18 2.214826 23.91088 431.9195
## 19 2.043198 28.51435 461.5439
## 20 1.823680 27.28226 387.0149
## 21 2.218899 24.29106 356.7623
## 22 1.784586 27.56687 444.6985
## 23 1.787658 31.03734 387.7521
## 24 2.032558 31.50736 449.9648
## 25 2.135126 25.04701 413.4826
## 26 1.902208 27.32053 383.7975
## 27 2.167401 25.17818 291.1477
## 28 2.007897 22.81182 380.9693
## 29 2.293348 27.51165 293.8116
## 30 1.541930 22.79358 378.0659
# population 2 (medium heterogeneity): means should track the same mu vectors
tapply(population2$score, list(population2$item, 
                               population2$scale), mean)
##           1        2         3
## 1  3.950501 33.00300  780.0960
## 2  3.750543 47.68966 1089.9544
## 3  3.735677 42.45975 1298.4786
## 4  3.988222 58.47058 1094.0806
## 5  3.334382 41.50217  843.9473
## 6  4.158273 68.92020 1135.7110
## 7  4.175987 53.60701 1059.9587
## 8  4.319914 39.82013  966.4655
## 9  3.580827 40.23406  971.7099
## 10 4.110682 36.55129 1207.6237
## 11 3.259170 49.53321 1227.2703
## 12 3.768975 44.31530 1325.8837
## 13 4.254734 56.24424  960.0673
## 14 3.929969 43.36030 1101.0468
## 15 4.083672 56.14094  944.9630
## 16 3.908679 32.22729 1046.5041
## 17 3.961111 71.02240  822.8286
## 18 4.106718 39.97576 1255.1825
## 19 4.325670 51.90323 1202.2096
## 20 4.054181 45.06967  803.6126
## 21 4.059961 40.96535 1149.1808
## 22 4.227156 51.07047  866.0613
## 23 4.048451 37.51939 1061.6135
## 24 4.116550 52.61103  975.8781
## 25 3.567186 57.94929  818.8236
## 26 3.890774 49.19034 1126.9975
## 27 4.290088 50.23184  955.9890
## 28 3.834475 50.37329  852.1188
## 29 3.685025 60.27283  772.6668
## 30 4.433935 57.36398  798.2312
# population 2 SDs: drawn with larger spread between items (sigma2.*) than population 1
tapply(population2$score, list(population2$item,
                               population2$scale), sd) 
##           1        2        3
## 1  2.763139 40.60330 272.9881
## 2  1.390085 31.57465 376.1165
## 3  1.577165 16.80232 217.6260
## 4  2.255374 10.97166 265.3098
## 5  2.409540 39.88434 368.8119
## 6  1.674612 29.04493 499.2654
## 7  2.026715 16.81706 203.5280
## 8  1.820940 41.46035 367.9732
## 9  2.493294 36.34031 270.3850
## 10 2.016523 17.23096 389.4101
## 11 2.468095 22.94024 463.2799
## 12 1.551968 31.83887 396.8280
## 13 2.105696 15.12426 198.6223
## 14 1.528888 31.08692 564.4742
## 15 2.277393 29.09624 329.6530
## 16 1.646159 27.38014 394.2811
## 17 2.355252 50.17814 385.7395
## 18 1.533480 13.74537 396.6787
## 19 2.019509 37.31631 425.8186
## 20 1.924035 27.78527 280.9045
## 21 1.853170 28.62314 410.2052
## 22 1.998172 33.01822 463.8064
## 23 2.004897 23.36831 378.2983
## 24 2.744461 27.39842 528.3982
## 25 2.224172 21.45756 546.9542
## 26 1.384873 16.25205 288.8152
## 27 2.655162 41.85640 203.0837
## 28 1.693612 19.18822 518.2790
## 29 1.962601 39.87810 524.9772
## 30 1.769749 23.70109 435.7949
# population 3 (high heterogeneity): means again track mu1/mu2/mu3
tapply(population3$score, list(population3$item, 
                               population3$scale), mean)
##           1        2         3
## 1  3.855587 32.97237  762.9452
## 2  3.868187 46.99772 1089.5423
## 3  3.845957 43.32952 1292.2083
## 4  3.959157 57.84889 1102.7523
## 5  3.584825 41.60252  893.0890
## 6  3.963773 69.25035 1163.2111
## 7  4.107680 55.74294 1053.3420
## 8  4.215722 36.27618  952.7271
## 9  3.702241 40.65176 1012.1138
## 10 3.993711 36.36285 1211.2836
## 11 3.269975 49.06809 1229.8295
## 12 3.678456 44.27366 1350.8253
## 13 4.233037 56.38444  981.2822
## 14 3.912706 43.10341 1103.4023
## 15 4.054028 55.62390  959.7905
## 16 3.958513 33.88419 1069.0534
## 17 3.997336 70.16391  831.0627
## 18 4.163447 38.04008 1235.3849
## 19 4.243718 51.63930 1180.5805
## 20 4.011720 45.24226  813.5420
## 21 4.101582 41.20121 1122.5218
## 22 4.183665 50.81866  862.3160
## 23 4.171506 39.85052 1064.2565
## 24 4.020276 51.25713  985.8166
## 25 3.849307 58.62198  843.9054
## 26 3.807178 49.42927 1117.0703
## 27 4.115374 49.08685  962.5971
## 28 4.022901 49.30474  885.9961
## 29 3.832931 57.22959  804.6436
## 30 4.582119 57.19845  801.0198
# population 3 SDs: widest item-to-item variability (sigma3.*)
tapply(population3$score, list(population3$item,
                               population3$scale), sd) 
##            1         2         3
## 1  3.2769848 30.592868 472.78086
## 2  2.7331542 27.606291 219.31130
## 3  2.2973465 11.310145 259.32081
## 4  2.3507781 23.973472 180.07712
## 5  1.7898735  4.842688 555.20067
## 6  1.9010021 19.076503 331.65811
## 7  3.3702103 38.460999 393.65848
## 8  1.9120117 27.669388 490.82736
## 9  3.2527780 42.160159 446.85460
## 10 2.0874351 32.874833 749.43323
## 11 2.7977082 20.573106 290.36279
## 12 3.3006312 31.424359 507.40278
## 13 1.1770792 39.776679 514.58535
## 14 2.3335247 15.357720 481.37062
## 15 0.7675617 43.340551  74.02248
## 16 1.5466065 46.567773 372.76795
## 17 1.7805748  7.299641 229.58366
## 18 2.6329110 41.760361 545.84021
## 19 4.0269547 16.713839 601.87632
## 20 0.8437944 15.940341 375.00585
## 21 1.2479703 26.052144 502.13185
## 22 2.2823698 33.476917 562.53933
## 23 2.6252197 42.923010 262.92805
## 24 5.1654318 19.085865 392.16348
## 25 1.6935013 33.603440 313.33447
## 26 1.4942078 19.737497 184.62965
## 27 2.7674848  9.092104 213.00917
## 28 2.8525696 36.119084 212.09303
## 29 1.8855775 46.882515 404.90381
## 30 1.8449062 11.442060 446.55408

Samples. Each population was then sampled as if a researcher was conducting a pilot study. The sample sizes started at 20 participants per item increasing in units of 5 up to 100 participants.

# create pilot samples from 20 to 100
# For each candidate pilot size, draw that many rows (without replacement)
# from every item-by-scale cell of each population, mimicking a researcher
# running a pilot study of that size.
samples1 <- samples2 <- samples3 <- list() 
sizes <- seq(20, 100, by = 5)
for (i in seq_along(sizes)){
  samples1[[i]] <- population1 %>% 
    group_by(item, scale) %>% 
    slice_sample(n = sizes[i])
  
  samples2[[i]] <- population2 %>% 
    group_by(item, scale) %>% 
    slice_sample(n = sizes[i])
    
  samples3[[i]] <- population3 %>% 
    group_by(item, scale) %>% 
    slice_sample(n = sizes[i])
}

AIPE Criteria. The standard errors of each item were calculated to mimic the AIPE procedure of finding an appropriately small confidence interval, as standard error functions as the main component in the formula for normal distribution confidence intervals. Standard errors were calculated at each decile of the items up to 90% (0% smallest SE, 10% …, 90% largest SE). The lower deciles would represent a strict criterion for accurate measurement, as many items would need smaller SEs to meet AIPE goals, while the higher deciles would represent less strict criteria for AIPE goals.

# calculate the SEs and the cutoff scores 
SES1 <- SES2 <- SES3 <- list()
cutoffs1 <- cutoffs2 <- cutoffs3 <- list()
sd_items1 <- sd_items2 <- sd_items3 <- list()
for (i in seq_along(samples1)){
  # SD of the per-item SDs within each scale: the heterogeneity summary
  # used later when modeling the corrected sample size
  sd_items1[[i]] <- samples1[[i]] %>% group_by(item, scale) %>% 
    summarize(sd = sd(score), .groups = "keep") %>% 
    ungroup() %>% group_by(scale) %>% summarize(sd_item = sd(sd))
  sd_items2[[i]] <- samples2[[i]] %>% group_by(item, scale) %>% 
    summarize(sd = sd(score), .groups = "keep") %>% 
    ungroup() %>% group_by(scale) %>% summarize(sd_item = sd(sd))
  sd_items3[[i]] <- samples3[[i]] %>% group_by(item, scale) %>% 
    summarize(sd = sd(score), .groups = "keep") %>% 
    ungroup() %>% group_by(scale) %>% summarize(sd_item = sd(sd))
  
  # per-item, per-scale standard errors of the mean for each pilot sample
  SES1[[i]] <- tapply(samples1[[i]]$score,
                      list(samples1[[i]]$item,
                           samples1[[i]]$scale),
                      function(x){ sd(x)/sqrt(length(x)) })
  SES2[[i]] <- tapply(samples2[[i]]$score,
                      list(samples2[[i]]$item,
                           samples2[[i]]$scale),
                      function(x){ sd(x)/sqrt(length(x)) })
  # BUG FIX: the grouping indices previously came from samples2, not
  # samples3. The two data frames happen to share the same item/scale
  # ordering, so results coincided, but the cross-reference was incorrect
  # and fragile.
  SES3[[i]] <- tapply(samples3[[i]]$score,
                      list(samples3[[i]]$item,
                           samples3[[i]]$scale),
                      function(x){ sd(x)/sqrt(length(x)) })

  # decile cutoffs of the item SEs per scale column
  # (0% = smallest SE ... 90% = largest SE)
  cutoffs1[[i]] <- apply(as.data.frame(SES1[[i]]), 2, 
                         quantile, 
                         probs = seq(0, .9, by = .1))
  cutoffs2[[i]] <- apply(as.data.frame(SES2[[i]]), 2, 
                         quantile, 
                         probs = seq(0, .9, by = .1))
  cutoffs3[[i]] <- apply(as.data.frame(SES3[[i]]), 2, 
                         quantile, 
                         probs = seq(0, .9, by = .1))
}

AIPE Simulation

In this section, we simulate what a researcher might do if they follow our suggested application of AIPE to sample size planning based on well measured items. Assuming each pilot sample represents a dataset a researcher has collected, we will simulate samples of 20 to 500 to determine what the new sample size suggestion would be. We assume that samples over 500 may be considered too large for many researchers who do not work in teams or have participant funds. The standard error of each item was calculated for each suggested sample size by pilot sample size by population type.

# sequence of sample sizes to try
samplesize_values <- seq(20, 500, 5)

# place to store everything
sampled_values1 <- sampled_values2 <- sampled_values3 <- list()

# number of item-by-scale cells (30 items x 3 scales); each cell gets one
# SE column, and the sample size is stored in the column after them
n_cells <- 30 * 3

# Bootstrap the per-item SEs at a candidate sample size by resampling
# (with replacement) from one pilot sample. Returns the vector of SEs in
# item-then-scale order (scale cycles fastest, per group_by(item, scale)).
boot_ses <- function(pilot, n) {
  pilot %>% 
    group_by(item, scale) %>% 
    slice_sample(n = n, replace = TRUE) %>% 
    summarize(se = sd(score)/sqrt(length(score)),
              .groups = "keep") %>% 
    pull(se)
}

# loop over the pilot samples
for (i in seq_along(samples1)){
  
  # blank tables: one row per candidate sample size, one column per cell
  sim_table1 <- as.data.frame(matrix(NA, 
                                     nrow = length(samplesize_values), 
                                     ncol = n_cells))
  sim_table2 <- sim_table3 <- sim_table1
  
  # add a place for sample size values 
  sim_table1$sample_size <- sim_table2$sample_size <- sim_table3$sample_size <- NA
  
  # loop over candidate sample sizes
  for (q in seq_along(samplesize_values)){
      
    sim_table1[q, 1:n_cells] <- boot_ses(samples1[[i]], samplesize_values[q])
    sim_table1[q, n_cells + 1] <- samplesize_values[q]
    
    sim_table2[q, 1:n_cells] <- boot_ses(samples2[[i]], samplesize_values[q])
    sim_table2[q, n_cells + 1] <- samplesize_values[q]
    
    sim_table3[q, 1:n_cells] <- boot_ses(samples3[[i]], samplesize_values[q])
    sim_table3[q, n_cells + 1] <- samplesize_values[q]
    
    } # end candidate sample size loop 
  
  sampled_values1[[i]] <- sim_table1
  sampled_values2[[i]] <- sim_table2
  sampled_values3[[i]] <- sim_table3

} # end all sample loop 

Next, the percent of items that fall below the cutoff scores, and thus, would be considered “well-measured” were calculated for each decile by sample.

# Reshape one simulated SE table to long format: one row per
# sample size x item x scale combination. The 90 value columns come from
# summarize(group_by(item, scale)), so within each sample size the order
# is item 1/scales 1-3, item 2/scales 1-3, ... (scale cycles fastest).
reshape_ses <- function(sim_table) {
  sim_table %>% 
    pivot_longer(cols = -c(sample_size)) %>% 
    rename(item = name, se = value) %>% 
    mutate(scale = rep(1:3, 30*length(samplesize_values)),
           item = rep(rep(1:30, each = 3), length(samplesize_values)))
}

# For one scale, compute the proportion of the 30 items whose simulated SE
# falls at or below each decile cutoff, at every candidate sample size.
# cutoff_mat is a 10 x 3 matrix (decile row x scale column).
percent_below <- function(long_ses, cutoff_mat, scale_idx, pilot_n, src) {
  long_ses %>% 
    filter(scale == scale_idx) %>% 
    group_by(sample_size, scale) %>% 
    summarize(Percent_Below0 = sum(se <= cutoff_mat[1, scale_idx])/30,
              Percent_Below10 = sum(se <= cutoff_mat[2, scale_idx])/30,
              Percent_Below20 = sum(se <= cutoff_mat[3, scale_idx])/30,
              Percent_Below30 = sum(se <= cutoff_mat[4, scale_idx])/30,
              Percent_Below40 = sum(se <= cutoff_mat[5, scale_idx])/30,
              Percent_Below50 = sum(se <= cutoff_mat[6, scale_idx])/30,
              Percent_Below60 = sum(se <= cutoff_mat[7, scale_idx])/30,
              Percent_Below70 = sum(se <= cutoff_mat[8, scale_idx])/30,
              Percent_Below80 = sum(se <= cutoff_mat[9, scale_idx])/30,
              Percent_Below90 = sum(se <= cutoff_mat[10, scale_idx])/30,
              .groups = "keep") %>% 
    mutate(original_n = pilot_n,
           source = src)
}

# Percent-below summaries for all three scales of one population, stacked.
summarize_population <- function(sim_table, cutoff_mat, pilot_n, src) {
  long_ses <- reshape_ses(sim_table)
  bind_rows(lapply(1:3, function(s) {
    percent_below(long_ses, cutoff_mat, s, pilot_n, src)
  }))
}

summary_list1 <- summary_list2 <- summary_list3 <- list()
for (i in seq_along(sampled_values1)){
  summary_list1[[i]] <- summarize_population(sampled_values1[[i]],
                                             cutoffs1[[i]], sizes[i], "low")
  summary_list2[[i]] <- summarize_population(sampled_values2[[i]],
                                             cutoffs2[[i]], sizes[i], "med")
  summary_list3[[i]] <- summarize_population(sampled_values3[[i]],
                                             cutoffs3[[i]], sizes[i], "high")
}

# stack the per-pilot-size summaries for all three heterogeneity levels
summary_DF <- bind_rows(summary_list1, 
                        summary_list2, 
                        summary_list3)

From this data, we pinpoint the smallest suggested sample size at which 80%, 85%, 90%, and 95% of the items fall below the cutoff criterion. These values were chosen as popular measures of “power” in which one could determine the minimum suggested sample size (potentially 80% of items) and the maximum suggested sample size (potentially 90%).

# Find the smallest simulated sample size at which at least `power_pct`
# percent of items fall at or below each decile cutoff, for every
# combination of pilot size, heterogeneity source, and scale.
smallest_n_at <- function(df, power_pct) {
  df %>% 
    pivot_longer(cols = -c(sample_size, original_n, source, scale)) %>% 
    filter(value >= power_pct / 100) %>% 
    arrange(sample_size, original_n, source, scale, name) %>% 
    group_by(original_n, name, source, scale) %>% 
    slice_head(n = 1) %>% 
    mutate(power = power_pct)
}

# one block per "power" criterion, stacked
summary_long <- bind_rows(lapply(c(80, 85, 90, 95),
                                 function(p) smallest_n_at(summary_DF, p)))

# readable factor labels for plotting
summary_long$source <- factor(summary_long$source, 
                              levels = c("low", "med", "high"),
                              labels = c("Low Variance", "Medium Variance", "High Variance"))

summary_long$scale2 <- factor(summary_long$scale, 
                              levels = 1:3,
                              labels = c("Small Scale", "Medium Scale", "Large Scale"))

# attach the heterogeneity estimates (SD of the item SDs) from the pilots;
# sd_items rows are ordered scale-within-size-within-source, matching the
# rep() patterns below
sd_items <- bind_rows(sd_items1, sd_items2, sd_items3)
sd_items$original_n <- rep(rep(sizes, each = 3), 3)
sd_items$source <- rep(c("Low Variance", "Medium Variance", "High Variance"),
                       each = 3*length(sizes))
summary_long <- summary_long %>% 
  full_join(sd_items, 
            by = c("original_n", "scale", "source"))

# human-readable cutoff labels for the plot legends
summary_long$name <- gsub("Percent_Below", "Decile ", summary_long$name)

Results

Differences in Item Variance

We examined if this procedure is sensitive to differences in item heterogeneity, as we should expect to collect larger samples if we wish to have a large number of items reach a threshold of acceptable variance; potentially, assuring we could average them if a researcher did not wish to use a more complex analysis such as multilevel modeling.

The figure below illustrates the potential minimum sample size for 80% of items to achieve a desired cutoff score. The black dots denote the original sample size against the suggested sample size. By comparing the facets, we can determine that our suggested procedure does capture the differences in heterogeneity. As heterogeneity increases in item variances, the proposed sample size also increases, especially at stricter cutoffs. Missing cutoff points indicate conditions for which the proposed sample size would have been higher than 500.

# re-level the variance factor so facets order Low -> Medium -> High
summary_long$source <- factor(summary_long$source, 
                              levels = c("Low Variance", "Medium Variance", "High Variance"))

# suggested n versus pilot n at the 80% criterion, faceted by
# heterogeneity x scale; black points mark the y = x reference
# (the pilot sample size itself)
ggplot(summary_long %>% filter(power == 80), aes(original_n, sample_size, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ source*scale2) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")

Sample Size Sensitivity to Pilot Data Size

In our second question, we examined if the suggested procedure was sensitive to the amount of information present in the pilot data. Larger pilot data is more informative, and therefore, we should expect a lower projected sample size. As shown in the figure below for only the low variability and small scale data, we do not find this effect. These simplistic simulations from the pilot data would nearly always suggest a larger sample size - mostly in a linear trend increasing with sample sizes. This result comes from the nature of the procedure - if we base our estimates on some SE cutoff, we will almost always need a bit more people for items to meet those goals. This result does not achieve our second goal.

# low-variance, small-scale condition only, faceted by power criterion:
# without correction the suggested n rises roughly linearly with pilot n
# instead of leveling off (black points mark y = x)
ggplot(summary_long %>% filter(source == "Low Variance") %>% filter(scale == 1), aes(original_n, sample_size, color = name)) + 
  geom_point() +  
  geom_point(aes(original_n, original_n), color = "black") + 
  geom_line() + 
  theme_classic() + 
  facet_wrap(~ power) + 
  xlab("Original Sample Size") + 
  ylab("Suggested Sample Size") + 
  scale_color_discrete(name = "Cutoff Score")

Therefore, we suggest using a correction factor on the simulation procedure to account for the known asymptotic nature of power (i.e., at larger sample sizes power increases level off). For this function, we combined a correction factor for upward biasing of effect sizes (Hedges’ correction) with the formula for exponential decay calculations. The decay factor is calculated as follows:

\[ 1 - \sqrt{\frac{N_{pilot} - \min(N_{sim})}{N_{pilot}}}^{\log_2(N_{pilot})}\] \(N_{pilot}\) indicates the sample size of the pilot data; the minimum simulated sample size is subtracted so that the smallest pilot samples do not decay (i.e., the formula zeroes out). This value is raised to the power of \(\log_2\) of the sample size of the pilot data, which decreases the impact of the decay to smaller increments for increasing sample sizes. This value is then multiplied by the proposed sample size. As shown in the figure below, this correction factor produces the desired quality of maintaining that small pilot studies should increase sample size, and that sample size suggestions level off as pilot study data sample size increases.

# Exponential-decay correction factor. The numerator subtracts 20 (the
# smallest simulated sample size), so the factor zeroes out for the smallest
# pilot samples; raising to log2(n_pilot) shrinks the decay step as pilot
# size grows, producing the desired leveling-off behavior.
decay <- with(
  summary_long,
  1 - sqrt((original_n - 20) / original_n)^log2(original_n)
)

# Corrected suggestion = raw simulated suggestion scaled by the decay factor.
summary_long$new_sample <- summary_long$sample_size * decay

# Same panel layout as the uncorrected figure, but plotting the
# decay-corrected suggestion (new_sample) against pilot sample size.
summary_long %>%
  filter(source == "Low Variance", scale == 1) %>%
  ggplot(aes(original_n, new_sample, color = name)) +
  geom_point() +
  geom_point(aes(original_n, original_n), color = "black") +
  geom_line() +
  theme_classic() +
  facet_wrap(~ power) +
  labs(x = "Original Sample Size", y = "Suggested Sample Size") +
  scale_color_discrete(name = "Cutoff Score")

Corrections for Individual Researchers

We have portrayed that this procedure, with a correction factor, can perform as desired. However, within real scenarios, researchers will only have one pilot sample, not the various simulated samples shown above. What should the researcher do to correct their sample size on their own pilot data?

To explore if we could recover the new suggested sample size from data a researcher would have, we used linear models to create a formula for calculation. First, the corrected sample size was predicted by the original suggested sample size. Next, the standard deviation of the item standard deviations was added to the equation to recreate heterogeneity estimates. Last, we included the pilot sample size. The scale of the data is embedded into the standard deviation of the items (r = 0.81), and therefore, this variable was not included separately. The standard deviation of the item’s standard deviation embeds both heterogeneity and potential scale size.

# Nested regression models predicting the corrected sample size from
# quantities a researcher can compute from a single pilot sample.
# Model 1: raw suggested sample size only.
user_model <- lm(new_sample ~ sample_size, data = summary_long)
user_print <- apa_print(user_model)

# Model 2: add the SD of the item SDs (heterogeneity + scale proxy).
user_model2 <- update(user_model, . ~ . + sd_item)
user_print2 <- apa_print(user_model2)

# Model 3: add the pilot sample size.
user_model3 <- update(user_model2, . ~ . + original_n)
user_print3 <- apa_print(user_model3)

# Sequential (nested) model comparison, tidied for reporting.
change_table <- anova(user_model, user_model2, user_model3) %>% tidy()

The first model using original sample size to predict new sample size was significant, \(R^2 = .89\), 90% CI \([0.89, 0.90]\), \(F(1, 5,666) = 48,042.64\), \(p < .001\), capturing nearly 90% of the variance, \(b = 0.62\), 95% CI \([0.62, 0.63]\). The second model with item standard deviation was better than the first model, F(1, 5665) = 55.21, p < .001, \(R^2 = .89\), 90% CI \([0.89, 0.90]\). The item standard deviation predictor was significant, \(b = 0.02\), 95% CI \([0.01, 0.03]\), \(t(5665) = 4.54\), \(p < .001\). The addition of the original pilot sample size was also significant, F(1, 5664) = 9529.83, p < .001, \(R^2 = .96\), 90% CI \([0.96, 0.96]\).

As shown in the table below for our final model, the new suggested sample size is proportional to the original suggested sample size (i.e., b < 1), which reduces the sample size suggestion. As variability increases, the suggested sample size also increases to capture differences in heterogeneity shown above; however, this predictor is not significant in the final model, and only contributes a small portion of overall variance. Last, in order to correct for large pilot data, the original pilot sample size decreases the new suggested sample size. This formula approximation captures 96% of the variance in sample size scores and should allow a researcher to estimate based on their own data.

# APA-formatted coefficient table for the final (three-predictor) model.
apa_table(tidy(user_model3))
(#tab:unnamed-chunk-12)
**
term estimate std.error statistic p.value
(Intercept) 39.27 0.44 89.84 0.00
sample_size 0.70 0.00 366.67 0.00
sd_item 0.00 0.00 0.95 0.34
original_n -0.69 0.01 -97.62 0.00

Choosing an Appropriate Cutoff

# Refit the three-predictor correction model separately within each SE-cutoff
# decile and collect each fit's R^2, so deciles can be compared on how well
# they recover the corrected sample-size suggestions.
by_cutoff <- list()
R2 <- list()

for (cutoff in unique(summary_long$name)) {
  # Subset to one cutoff decile and refit the correction model.
  by_cutoff[[cutoff]] <- lm(
    new_sample ~ sample_size + sd_item + original_n,
    data = summary_long %>% filter(name == cutoff)
  )
  # `[[` assigns a single named element; the original's single-bracket
  # `R2[cutoff] <-` relies on list-slice replacement semantics.
  R2[[cutoff]] <- summary(by_cutoff[[cutoff]])$r.squared
}

R2
## $`Decile 0`
## [1] 0.966061
## 
## $`Decile 10`
## [1] 0.9631429
## 
## $`Decile 20`
## [1] 0.9671475
## 
## $`Decile 30`
## [1] 0.9675296
## 
## $`Decile 40`
## [1] 0.9669199
## 
## $`Decile 50`
## [1] 0.9647775
## 
## $`Decile 60`
## [1] 0.9642954
## 
## $`Decile 70`
## [1] 0.9642689
## 
## $`Decile 80`
## [1] 0.9624604
## 
## $`Decile 90`
## [1] 0.9650027

Last, we examine the question of an appropriate SE decile. All graphs for power, heterogeneity, scale, and correction are presented below. First, the 0%, 10%, and 20% deciles are likely too restrictive, providing very large estimates that do not always find a reasonable sample size in proportion to the pilot sample size, scale, and heterogeneity. If we examine the \(R^2\) values for each decile of our regression equation separately, we find that the 50% decile (0.965) represents the best match to our corrected sample size suggestions. The 50% decile, in the corrected format, appears to meet all goals: 1) increases with heterogeneity and scale of data, and 2) higher suggested values for small original samples and a leveling effect at larger pilot data.

The formula for finding the corrected sample size using a 50% decile is: \(Final Sample = 39.269 + 0.700 \times X_{Suggested Sample Size} + 0.003 \times X_{SD Items} - 0.695 \times X_{Pilot Sample Size}\). The suggested sample size will be estimated from the 80%, 85%, 90%, or 95% selection at the 50% decile as shown above. The item SD can be calculated directly from the data, and the pilot sample size is the sample size of the data from which a researcher is simulating their samples.

Conclusions

  1. Take your original pilot data
  2. Find a critical score for an accurately measured item using SE at the 50% decile of the item SE values.
  3. Sample from your pilot data using sample sizes starting at 20 participants and increase in units of 5 up to a value you feel is the maximum sample size.
  4. Calculate the number of items that meet your critical score at each sample size.
  5. Find the smallest sample size for 80%, 85%, 90%, and 95% power.
  6. Apply the correction to the smallest sample sizes.
  7. Designate your minimum sample size from these values, the stopping rule used from step 2, and the maximum sample size from these values.

Low Variability

# Uncorrected suggestions for all low-variance conditions, faceted by the
# scale-by-power combination; black points mark the identity line.
summary_long %>%
  filter(source == "Low Variance") %>%
  ggplot(aes(original_n, sample_size, color = name)) +
  geom_point() +
  geom_point(aes(original_n, original_n), color = "black") +
  geom_line() +
  theme_classic() +
  facet_wrap(~ scale2*power) +
  labs(x = "Original Sample Size", y = "Suggested Sample Size") +
  scale_color_discrete(name = "Cutoff Score")

# Decay-corrected suggestions for all low-variance conditions, faceted by
# the scale-by-power combination.
summary_long %>%
  filter(source == "Low Variance") %>%
  ggplot(aes(original_n, new_sample, color = name)) +
  geom_point() +
  geom_point(aes(original_n, original_n), color = "black") +
  geom_line() +
  theme_classic() +
  facet_wrap(~ scale2*power) +
  labs(x = "Original Sample Size", y = "Suggested Sample Size") +
  scale_color_discrete(name = "Cutoff Score")

Medium Variability

# Uncorrected suggestions for the medium-variance conditions.
summary_long %>%
  filter(source == "Medium Variance") %>%
  ggplot(aes(original_n, sample_size, color = name)) +
  geom_point() +
  geom_point(aes(original_n, original_n), color = "black") +
  geom_line() +
  theme_classic() +
  facet_wrap(~ scale2*power) +
  labs(x = "Original Sample Size", y = "Suggested Sample Size") +
  scale_color_discrete(name = "Cutoff Score")

# NOTE(review): this recomputes new_sample with the same `decay` vector used
# earlier, so the assignment is redundant — it reproduces the existing values.
summary_long$new_sample <- summary_long$sample_size*decay

# Decay-corrected suggestions for the medium-variance conditions.
summary_long %>%
  filter(source == "Medium Variance") %>%
  ggplot(aes(original_n, new_sample, color = name)) +
  geom_point() +
  geom_point(aes(original_n, original_n), color = "black") +
  geom_line() +
  theme_classic() +
  facet_wrap(~ scale2*power) +
  labs(x = "Original Sample Size", y = "Suggested Sample Size") +
  scale_color_discrete(name = "Cutoff Score")

High Variability

# Uncorrected suggestions for the high-variance conditions.
summary_long %>%
  filter(source == "High Variance") %>%
  ggplot(aes(original_n, sample_size, color = name)) +
  geom_point() +
  geom_point(aes(original_n, original_n), color = "black") +
  geom_line() +
  theme_classic() +
  facet_wrap(~ scale2*power) +
  labs(x = "Original Sample Size", y = "Suggested Sample Size") +
  scale_color_discrete(name = "Cutoff Score")

# Decay-corrected suggestions for the high-variance conditions.
summary_long %>%
  filter(source == "High Variance") %>%
  ggplot(aes(original_n, new_sample, color = name)) +
  geom_point() +
  geom_point(aes(original_n, original_n), color = "black") +
  geom_line() +
  theme_classic() +
  facet_wrap(~ scale2*power) +
  labs(x = "Original Sample Size", y = "Suggested Sample Size") +
  scale_color_discrete(name = "Cutoff Score")